1 Bibliotecas

library(data.table)
library(dplyr)
library(ggplot2)
library(grid)
library(plotly)
library(tibble)
library(stringr)

2 Importando os dados

# Load the combined red/white wine dataset (read.csv2 expects ';' as the field
# separator and ',' as the decimal mark).
# stringsAsFactors = TRUE keeps `Vinho` a factor on R >= 4.0, where the default
# became FALSE; the recorded glimpse()/summary() output shows it as a factor.
wine_dset <- read.csv2("BaseWine_Red_e_White2018.csv", stringsAsFactors = TRUE)

2.1 Uma visão rápida no dataset

# Compact overview of the dataset: one line per column with its type and
# first values.
wine_dset %>% glimpse()
## Observations: 6,497
## Variables: 14
## $ id_vinho           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...
## $ fixedacidity       <dbl> 6.6, 6.7, 10.6, 5.4, 6.7, 6.8, 6.6, 7.2, 5....
## $ volatileacidity    <dbl> 0.240, 0.340, 0.310, 0.180, 0.300, 0.500, 0...
## $ citricacid         <dbl> 0.35, 0.43, 0.49, 0.24, 0.44, 0.11, 0.00, 0...
## $ residualsugar      <dbl> 7.70, 1.60, 2.20, 4.80, 18.75, 1.50, 1.60, ...
## $ chlorides          <dbl> 0.031, 0.041, 0.063, 0.041, 0.057, 0.075, 0...
## $ freesulfurdioxide  <dbl> 36, 29, 18, 30, 65, 16, 4, 34, 46, 58, 54, ...
## $ totalsulfurdioxide <dbl> 135, 114, 40, 113, 224, 49, 8, 102, 113, 18...
## $ density            <dbl> 0.99380, 0.99014, 0.99760, 0.99445, 0.99956...
## $ pH                 <dbl> 3.19, 3.23, 3.14, 3.42, 3.11, 3.36, 3.33, 3...
## $ sulphates          <dbl> 0.37, 0.44, 0.51, 0.40, 0.53, 0.79, 0.37, 0...
## $ alcohol            <dbl> 10.50, 12.60, 9.80, 9.40, 9.10, 9.50, 10.40...
## $ quality            <int> 5, 6, 6, 6, 5, 5, 4, 6, 7, 6, 5, 6, 6, 6, 6...
## $ Vinho              <fct> WHITE, WHITE, RED, WHITE, WHITE, RED, RED, ...

2.2 Tamanho do dataset dividido pelo tipo de vinho

# Number of rows per wine type; counting the grouped frame keeps the result
# grouped by `Vinho`.
count(group_by(wine_dset, Vinho))
## # A tibble: 2 x 2
## # Groups:   Vinho [2]
##   Vinho     n
##   <fct> <int>
## 1 RED    1599
## 2 WHITE  4898

2.3 Integridade dos dados por tipo de vinho

# Split the dataset into one data frame per wine type.
wine_white_dset <- filter(wine_dset, Vinho == "WHITE")
wine_red_dset <- filter(wine_dset, Vinho == "RED")

# Count missing cells in each per-type subset (0 means no NA anywhere).
wine_white_dset %>% is.na() %>% sum()
## [1] 0
wine_red_dset %>% is.na() %>% sum()
## [1] 0

Os vinhos branco e vermelho apresentam características diferentes que definem se cada um é bom ou ruim. Vamos dar uma olhada nos dados de cada característica:

2.4 Característica de cada tipo de vinho

2.4.1 Sumário dos dados

Vinho Branco

# Per-column summary statistics for the white wines.
wine_white_dset %>% summary()
##     id_vinho     fixedacidity    volatileacidity    citricacid    
##  Min.   :   1   Min.   : 3.800   Min.   :0.0800   Min.   :0.0000  
##  1st Qu.:1650   1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700  
##  Median :3310   Median : 6.800   Median :0.2600   Median :0.3200  
##  Mean   :3284   Mean   : 6.855   Mean   :0.2782   Mean   :0.3342  
##  3rd Qu.:4932   3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900  
##  Max.   :6497   Max.   :14.200   Max.   :1.1000   Max.   :1.6600  
##  residualsugar      chlorides       freesulfurdioxide totalsulfurdioxide
##  Min.   : 0.600   Min.   :0.00900   Min.   :  2.00    Min.   :  9.0     
##  1st Qu.: 1.700   1st Qu.:0.03600   1st Qu.: 23.00    1st Qu.:108.0     
##  Median : 5.200   Median :0.04300   Median : 34.00    Median :134.0     
##  Mean   : 6.387   Mean   :0.04577   Mean   : 35.31    Mean   :138.4     
##  3rd Qu.: 9.900   3rd Qu.:0.05000   3rd Qu.: 46.00    3rd Qu.:167.0     
##  Max.   :45.800   Max.   :0.34600   Max.   :289.00    Max.   :440.0     
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0140   Max.   :3.820   Max.   :1.0800   Max.   :14.20  
##     quality        Vinho     
##  Min.   :3.000   RED  :   0  
##  1st Qu.:5.000   WHITE:4898  
##  Median :6.000               
##  Mean   :5.878               
##  3rd Qu.:6.000               
##  Max.   :9.000

Vinho Vermelho

# Per-column summary statistics for the red wines.
wine_red_dset %>% summary()
##     id_vinho     fixedacidity   volatileacidity    citricacid   
##  Min.   :   3   Min.   : 4.60   Min.   :0.1200   Min.   :0.000  
##  1st Qu.:1523   1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090  
##  Median :3103   Median : 7.90   Median :0.5200   Median :0.260  
##  Mean   :3141   Mean   : 8.32   Mean   :0.5278   Mean   :0.271  
##  3rd Qu.:4690   3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420  
##  Max.   :6490   Max.   :15.90   Max.   :1.5800   Max.   :1.000  
##  residualsugar      chlorides       freesulfurdioxide totalsulfurdioxide
##  Min.   : 0.900   Min.   :0.01200   Min.   : 1.00     Min.   :  6.00    
##  1st Qu.: 1.900   1st Qu.:0.07000   1st Qu.: 7.00     1st Qu.: 22.00    
##  Median : 2.200   Median :0.07900   Median :14.00     Median : 38.00    
##  Mean   : 2.539   Mean   :0.08747   Mean   :15.87     Mean   : 46.47    
##  3rd Qu.: 2.600   3rd Qu.:0.09000   3rd Qu.:21.00     3rd Qu.: 62.00    
##  Max.   :15.500   Max.   :0.61100   Max.   :72.00     Max.   :289.00    
##     density             pH          sulphates         alcohol       
##  Min.   :0.9901   Min.   :2.740   Min.   :0.3300   Min.   : 0.9567  
##  1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.5000  
##  Median :0.9968   Median :3.310   Median :0.6200   Median :10.2000  
##  Mean   :0.9967   Mean   :3.311   Mean   :0.6581   Mean   :10.4001  
##  3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.1000  
##  Max.   :1.0037   Max.   :4.010   Max.   :2.0000   Max.   :14.9000  
##     quality        Vinho     
##  Min.   :3.000   RED  :1599  
##  1st Qu.:5.000   WHITE:   0  
##  Median :6.000               
##  Mean   :5.636               
##  3rd Qu.:6.000               
##  Max.   :8.000

2.4.1.1 Diferença da mediana das características

Medianas - Tinto dataset

## fixedacidity

# wine_dset %>% 
#   ggplot(aes(fixedacidity, fill = Vinho)) +
#   geom_boxplot()

# wine_Cmedian

# ou
# > sapply(teste_matrix, median)

# ou 
# > sort(teste_matrix$ColunaB)

# Median of every red-wine column except Vinho, id_vinho and quality.
# vapply() guarantees exactly one numeric value per column (sapply() would
# silently change its return type on unexpected input).
median_red_dset <- vapply(
  select(wine_red_dset, -c(Vinho, id_vinho, quality)),
  median,
  numeric(1)
)

# One-column data frame; the column names of the source become row names.
median_red_dset <- as.data.frame(median_red_dset)

# Display with a readable header; median_red_dset itself is not renamed.
setNames(median_red_dset, "Median")
##                      Median
## fixedacidity        7.90000
## volatileacidity     0.52000
## citricacid          0.26000
## residualsugar       2.20000
## chlorides           0.07900
## freesulfurdioxide  14.00000
## totalsulfurdioxide 38.00000
## density             0.99675
## pH                  3.31000
## sulphates           0.62000
## alcohol            10.20000

Medianas - Branco dataset

# Median of every white-wine column except Vinho, id_vinho and quality.
# vapply() guarantees exactly one numeric value per column (sapply() would
# silently change its return type on unexpected input).
median_white_dset <- vapply(
  select(wine_white_dset, -c(Vinho, id_vinho, quality)),
  median,
  numeric(1)
)

# One-column data frame; the column names of the source become row names.
median_white_dset <- as.data.frame(median_white_dset)

# Display with a readable header; median_white_dset itself is not renamed.
setNames(median_white_dset, "Median")
##                       Median
## fixedacidity         6.80000
## volatileacidity      0.26000
## citricacid           0.32000
## residualsugar        5.20000
## chlorides            0.04300
## freesulfurdioxide   34.00000
## totalsulfurdioxide 134.00000
## density              0.99374
## pH                   3.18000
## sulphates            0.47000
## alcohol             10.40000

Diferença das medianas entre os dois tipos de vinhos

# Absolute white-vs-red difference of the medians, one row per feature, with
# the feature names moved from the row names into a `rowname` column.
Cmedian_differenc <- abs(median_white_dset - median_red_dset) %>%
  setNames("Mediana_Difer") %>%
  rownames_to_column()

# Display the features ordered from the largest to the smallest difference.
Cmedian_differenc %>%
  rename(Caracteristica = rowname) %>%
  arrange(desc(Mediana_Difer))
##        Caracteristica Mediana_Difer
## 1  totalsulfurdioxide      96.00000
## 2   freesulfurdioxide      20.00000
## 3       residualsugar       3.00000
## 4        fixedacidity       1.10000
## 5     volatileacidity       0.26000
## 6             alcohol       0.20000
## 7           sulphates       0.15000
## 8                  pH       0.13000
## 9          citricacid       0.06000
## 10          chlorides       0.03600
## 11            density       0.00301

Nota: a ordem descendente dessas características será utilizada nos plots para uma melhor visualização:

2.4.1.2 Histogramas de cada característica

Multiplot function

# Draw several plot objects on one page using a grid layout.
#
# Plot objects can be passed in ..., or to plotlist (as a list of plot objects);
# the two are concatenated, with the ... plots first.
# - file:   Not used by the function; kept so existing calls stay valid.
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# With no plots at all the function returns NULL invisibly and draws nothing.
# A single plot is printed directly, without setting up a grid page.
#

multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  n_plots <- length(plots)

  # Nothing to draw: bail out instead of indexing into an empty list.
  if (n_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Rows needed to fit every plot in the requested number of columns;
    # cells are numbered column-wise (matrix() default fill order).
    n_rows <- ceiling(n_plots / cols)
    layout <- matrix(seq_len(cols * n_rows), ncol = cols, nrow = n_rows)
  }

  if (n_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(n_plots)) {
      # Get the row/col positions of the layout cells that contain plot i
      # (several cells when the plot spans more than one region).
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
# Box plots of the four features with the largest median difference between
# wine types, one chart per feature, split and coloured by `Vinho`.
# Each chart carries the name of the feature it actually plots (the titles
# were previously all "Total sulfurdioxide").
plot_ly(wine_dset, y = ~totalsulfurdioxide, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Total sulfurdioxide")
plot_ly(wine_dset, y = ~freesulfurdioxide, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Free sulfurdioxide")
plot_ly(wine_dset, y = ~residualsugar, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Residual sugar")
plot_ly(wine_dset, y = ~fixedacidity, type = "box",
        color = ~Vinho, colors = c("red", "khaki")) %>%
  layout(title = "Fixed acidity")

Observando a mediana de cada característica, vemos uma diferença considerável entre os vinhos (Branco e Vermelho); portanto, vamos analisar somente um tipo de vinho para manter a análise coerente.

Como o número de amostras de vinhos brancos é bem maior do que o de vinhos vermelhos (aprox. 3 vezes maior), é mais interessante utilizar os dados com mais amostras para treinar e validar nosso modelo.